{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "- [GaussianMixture 高斯混合模型概率分布](http://scikit-learn.org/stable/modules/generated/sklearn.mixture.GaussianMixture.html#sklearn.mixture.GaussianMixture)\n",
    "- [pairwise_distances_argmin 计算一点和一组点之间的最小距离](http://scikit-learn.org/stable/modules/generated/sklearn.metrics.pairwise_distances_argmin.html#sklearn.metrics.pairwise_distances_argmin)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import matplotlib as mpl\n",
    "import matplotlib.pyplot as plt\n",
    "from mpl_toolkits.mplot3d import Axes3D\n",
    "from scipy.stats import multivariate_normal\n",
    "\n",
    "from sklearn.mixture import GaussianMixture\n",
    "from sklearn.metrics.pairwise import pairwise_distances_argmin"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# 解决中文显示问题\n",
    "mpl.rcParams['font.sans-serif'] = [u'SimHei']\n",
    "mpl.rcParams['axes.unicode_minus'] = False"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# 设置在jupyter中matplotlib的显示情况（默认inline是内嵌显示，通过设置为tk表示不内嵌显示）\n",
    "%matplotlib tk"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "'''\n",
    "sklearn.mixture.GaussianMixture在0.18版本以前是sklearn.mixture.GMM，两者的参数基本类型，这里主要介绍GaussianMixture的相关参数\n",
    "属性参数：\n",
    "n_components：混合组合的个数，默认为1, 可以理解为聚类/分类数量\n",
    "covariance_type：给定协方差的类型，可选: full、tied、diag、spherical，默认为full；\n",
    "    full：每个组件都有自己的公用的协防差矩阵，tied：所有组件公用一个协方差矩阵，\n",
    "    diag：每个组件都有自己的斜对角协方差矩阵，spherical：每个组件都有自己单独的方差值\n",
    "tol：默认1e-3，收敛阈值，如果在迭代过程中，平均增益小于该值的时候，EM算法结束。\n",
    "reg_covar：协方差对角线上的非负正则化参数，默认为0\n",
    "max_iter：em算法的最大迭代次数，默认100\n",
    "n_init: 默认值1，执行初始化操作数量，该参数最好不要变动\n",
    "init_params：初始化权重值、均值以及精度的方法，参数可选：kmeans、random，默认kmeans；kmeans：使用kmeans算法进行初始化操作\n",
    "weights_init：初始化权重列表，如果没有给定，那么使用init_params参数给定的方法来进行创建，默认为None\n",
    "means_init：初始化均值列表，如果没有给定，那么使用init_params参数给定的方法来进行创建，默认为None\n",
    "precisions_init: 初始化精度列表，如果没有给定，那么使用init_params参数给定的方法来进行创建，默认为None\n",
    "warn_stat：默认为False，当该值为true的时候，在类似问题被多次训练的时候，可以加快收敛速度\n",
    "'''\n",
    "## 使用scikit携带的EM算法或者自己实现的EM算法\n",
    "def trainModel(style, x):\n",
    "    if style == 'sklearn':\n",
    "        # 对象创建\n",
    "        g = GaussianMixture(n_components=2, covariance_type='full', tol=1e-6, max_iter=1000, init_params='kmeans')\n",
    "        # 模型训练\n",
    "        g.fit(x)\n",
    "        # 效果输出\n",
    "        print ('类别概率:\\t', g.weights_[0])\n",
    "        print ('均值:\\n', g.means_, '\\n')\n",
    "        print ('方差:\\n', g.covariances_, '\\n')\n",
    "        mu1, mu2 = g.means_\n",
    "        sigma1, sigma2 = g.covariances_\n",
    "        # 返回数据\n",
    "        return (mu1, mu2, sigma1, sigma2)\n",
    "    else:\n",
    "        ## 自己实现一个EM算法\n",
    "        num_iter = 100\n",
    "        n, d = data.shape\n",
    "        \n",
    "        # 初始化均值和方差正定矩阵\n",
    "        mu1 = data.min(axis=0)\n",
    "        mu2 = data.max(axis=0)\n",
    "        sigma1 = np.identity(d)\n",
    "        sigma2 = np.identity(d)\n",
    "        pi = 0.5\n",
    "        \n",
    "        # 实现EM算法\n",
    "        for i in range(num_iter):\n",
    "            # E Step\n",
    "            norm1 = multivariate_normal(mu1, sigma1)\n",
    "            norm2 = multivariate_normal(mu2, sigma2)\n",
    "            tau1 = pi * norm1.pdf(data)\n",
    "            tau2 = (1 - pi) * norm2.pdf(data)\n",
    "            gamma = tau1 / (tau1 + tau2)\n",
    "            \n",
    "            # M Step\n",
    "            mu1 = np.dot(gamma, data) / np.sum(gamma)\n",
    "            mu2 = np.dot((1 - gamma), data) / np.sum((1 - gamma))\n",
    "            sigma1 = np.dot(gamma * (data - mu1).T, data - mu1) / np.sum(gamma)\n",
    "            sigma2 = np.dot((1 - gamma) * (data - mu2).T, data - mu2) / np.sum(1 - gamma)\n",
    "            pi = np.sum(gamma) / n\n",
    "            \n",
    "            # 输出信息\n",
    "            j = i + 1\n",
    "            if j % 10 == 0:\n",
    "                print (j, \":\\t\", mu1, mu2)\n",
    "        \n",
    "        # 效果输出\n",
    "        print ('类别概率:\\t', pi)\n",
    "        print ('均值:\\t', mu1, mu2)\n",
    "        print ('方差:\\n', sigma1, '\\n\\n', sigma2, '\\n')\n",
    "        \n",
    "        # 返回结果\n",
    "        return (mu1, mu2, sigma1, sigma2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# 创建模拟数据（3维数据）\n",
    "np.random.seed(28)\n",
    "N = 500\n",
    "M = 250\n",
    "\n",
    "## 根据给定的均值和协方差矩阵构建数据\n",
    "mean1 = (0, 0, 0)\n",
    "cov1 = np.diag((1, 2, 3))\n",
    "## 产生400条数据\n",
    "data1 = np.random.multivariate_normal(mean1, cov1, N)\n",
    "\n",
    "## 产生一个数据分布不均衡的数据集， 100条\n",
    "mean2 = (2, 2, 1)\n",
    "cov2 = np.array(((1, 1, 3), (1, 2, 1), (0, 0, 1)))\n",
    "data2 = np.random.multivariate_normal(mean2, cov2, M)\n",
    "\n",
    "## 合并data1和data2这两个数据集\n",
    "data = np.vstack((data1, data2))\n",
    "\n",
    "## 产生数据对应的y值\n",
    "y1 = np.array([True] * N + [False] * M)\n",
    "y2 = ~y1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {
    "collapsed": false,
    "scrolled": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "10 :\t [ 0.07675269 -0.01686764 -0.01737712] [ 2.04021233  2.1175765   0.8898225 ]\n",
      "20 :\t [ 0.05457846 -0.01657497  0.01556185] [ 1.99881444  2.024444    0.78550548]\n",
      "30 :\t [ 0.0493724  -0.01530053  0.01998181] [ 1.98194848  1.99372062  0.76630687]\n",
      "40 :\t [ 0.04769738 -0.0146461   0.02114908] [ 1.97617151  1.98312041  0.7605966 ]\n",
      "50 :\t [ 0.04713832 -0.0144041   0.02151916] [ 1.97421785  1.97952545  0.75872926]\n",
      "60 :\t [ 0.04695101 -0.01432056  0.0216412 ] [ 1.97356084  1.9783154   0.75810762]\n",
      "70 :\t [ 0.04688822 -0.01429228  0.02168189] [ 1.97334033  1.97790917  0.75789968]\n",
      "80 :\t [ 0.04686717 -0.01428277  0.02169551] [ 1.97326637  1.9777729   0.75783001]\n",
      "90 :\t [ 0.04686011 -0.01427957  0.02170007] [ 1.97324158  1.97772721  0.75780666]\n",
      "100 :\t [ 0.04685774 -0.0142785   0.0217016 ] [ 1.97323326  1.97771189  0.75779883]\n",
      "类别概率:\t 0.656477803179\n",
      "均值:\t [ 0.04685774 -0.0142785   0.0217016 ] [ 1.97323326  1.97771189  0.75779883]\n",
      "方差:\n",
      " [[ 1.01791148  0.0450431  -0.10519131]\n",
      " [ 0.0450431   2.16506287  0.02188142]\n",
      " [-0.10519131  0.02188142  2.5601983 ]] \n",
      "\n",
      " [[ 0.8726692   1.17431899  1.20262496]\n",
      " [ 1.17431899  2.34533766  1.53476534]\n",
      " [ 1.20262496  1.53476534  3.6195891 ]] \n",
      "\n"
     ]
    }
   ],
   "source": [
    "## 预测结果(得到概率密度值)\n",
    "style = 'sklearn'\n",
    "style = 'self'\n",
    "mu1, mu2, sigma1, sigma2 = trainModel(style, data)\n",
    "# 预测分类（根据均值和方差对原始数据进行概率密度的推测）\n",
    "norm1 = multivariate_normal(mu1, sigma1)\n",
    "norm2 = multivariate_normal(mu2, sigma2)\n",
    "tau1 = norm1.pdf(data)\n",
    "tau2 = norm2.pdf(data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "距离: [0 1]\n",
      "准确率：88.53%\n"
     ]
    }
   ],
   "source": [
    "## 计算均值的距离，然后根据距离得到分类情况\n",
    "dist = pairwise_distances_argmin([mean1, mean2], [mu1, mu2], metric='euclidean')\n",
    "print (\"距离:\", dist)\n",
    "if dist[0] == 0:\n",
    "    c1 = tau1 > tau2\n",
    "else:\n",
    "    c1 = tau1 < tau2\n",
    "c2 = ~c1\n",
    "\n",
    "## 计算准备率\n",
    "acc = np.mean(y1 == c1)\n",
    "print (u'准确率：%.2f%%' % (100*acc))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "## 画图\n",
    "fig = plt.figure(figsize=(12, 6), facecolor='w')\n",
    "\n",
    "## 添加一个子图，设置为3d的\n",
    "ax = fig.add_subplot(121, projection='3d')\n",
    "## 点图\n",
    "ax.scatter(data[y1, 0], data[y1, 1], data[y1, 2], c='r', s=30, marker='o', depthshade=True)\n",
    "ax.scatter(data[y2, 0], data[y2, 1], data[y2, 2], c='g', s=30, marker='^', depthshade=True)\n",
    "## 标签\n",
    "ax.set_xlabel('X')\n",
    "ax.set_ylabel('Y')\n",
    "ax.set_zlabel('Z')\n",
    "## 标题\n",
    "ax.set_title(u'原始数据', fontsize=16)\n",
    "\n",
    "## 添加一个子图，设置为3d\n",
    "ax = fig.add_subplot(122, projection='3d')\n",
    "# 画点\n",
    "ax.scatter(data[c1, 0], data[c1, 1], data[c1, 2], c='r', s=30, marker='o', depthshade=True)\n",
    "ax.scatter(data[c2, 0], data[c2, 1], data[c2, 2], c='g', s=30, marker='^', depthshade=True)\n",
    "# 设置标签\n",
    "ax.set_xlabel('X')\n",
    "ax.set_ylabel('Y')\n",
    "ax.set_zlabel('Z')\n",
    "# 设置标题\n",
    "ax.set_title(u'EM算法分类', fontsize=16)\n",
    "\n",
    "# 设置总标题\n",
    "plt.suptitle(u'EM算法的实现,准备率：%.2f%%' % (acc * 100), fontsize=20)\n",
    "plt.subplots_adjust(top=0.90)\n",
    "plt.tight_layout()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python [default]",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
